RUNTIME:
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(varian)
## Loading required package: rstan
## Loading required package: StanHeaders
## Loading required package: ggplot2
## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract() masks rstan::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(corrplot)
## corrplot 0.84 loaded
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.0-2
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(broom)
library(knitr)
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(RColorBrewer)
features_all_modalities = function(fn, win_size, shift) {
  # Compute rolling-window summary features for every signal column in a
  # per-subject CSV.
  #
  # fn       : path to the per-subject raw signal CSV.
  # win_size : window length in seconds.
  # shift    : number of samples to advance between windows.
  # Returns a data.frame with <signal>_{mean,sd,min,max} columns plus
  # Label and Subject.

  # Load the raw signals and derive 3D acceleration magnitudes for the
  # chest and wrist accelerometers.
  raw = read.csv(fn) %>%
    mutate(ACC_chest_3D = sqrt(ACC_chest_X^2+ACC_chest_Y^2+ACC_chest_Z^2)) %>%
    mutate(ACC_wrist_3D = sqrt(ACC_wrist_x^2+ACC_wrist_y^2+ACC_wrist_z^2))

  # Keep only the signal columns; identifiers/labels are re-attached below.
  signals = raw[, !(names(raw) %in% c("X", "Label", "subject"))]

  window = win_size * 4  # NOTE(review): assumes 4 samples/second -- confirm
  # Number of rolling windows produced for this width/shift combination.
  n_windows = length(rollapply(signals[, 1], width = window, by = shift,
                               FUN = mean, align = "left"))

  # One output column per (signal, statistic) pair, named <signal>_<stat>.
  stats = c("mean", "sd", "min", "max")
  out = data.frame(matrix(ncol = ncol(signals) * 4, nrow = n_windows))
  colnames(out) = as.vector(sapply(colnames(signals),
                                   function(nm) paste0(nm, "_", stats)))

  for (nm in colnames(signals)) {
    roll = function(f) rollapply(signals[[nm]], width = window, by = shift,
                                 FUN = f, align = "left")
    out[[paste0(nm, "_mean")]] = roll(mean)
    out[[paste0(nm, "_sd")]]   = roll(sd)   # sd(x) == sqrt(var(x))
    out[[paste0(nm, "_max")]]  = roll(max)
    out[[paste0(nm, "_min")]]  = roll(min)
  }

  # Re-attach the label and subject identifier.
  # NOTE(review): this takes the FIRST n_windows raw rows rather than one
  # label per window -- confirm the downstream alignment is intended.
  out$Label = raw[1:n_windows, "Label"]
  out$Subject = raw[1:n_windows, "subject"]
  return(out)
}
HR_calc = function(df, win_size, shift) {
  # Rolling-window summaries of the E4 heart-rate stream.
  # Each HR row is repeated 4x to align with the 4-samples/second grid used
  # by the other modalities (assumption -- confirm against the export rate).
  # Returns a list with mu/sd/min/max rolling statistics.
  upsampled <- df[rep(seq_len(nrow(df)), each = 4), ]
  window <- win_size * 4
  roll <- function(f) {
    rollapply(upsampled, width = window, by = shift, FUN = f, align = "left")
  }
  list(mu = roll(mean), sd = roll(sd), min = roll(min), max = roll(max))
}
# Build the master feature set: one row per rolling window per subject.
# Expects the per-subject chest+wrist CSVs ("df_S*.csv") in the working
# directory, and the E4 HR files under ~/case-study-2/WESAD/.
file_list <- list.files()
subject_data = file_list[grepl("df_S", file_list, fixed = TRUE)]
ALL_df = NULL
for (i in 1:length(subject_data)) {
print(i)  # progress indicator (one line per subject)
# 5-second windows shifted by 1 sample, over all chest/wrist modalities.
feature_data = features_all_modalities(subject_data[i], 5, 1) #CHANGE FILE PATH
# Extract the subject id (e.g. "S3") from a filename like "df_S3.csv".
subject_no = gsub("\\..*","", sub('.*_', '', subject_data[i]))
HR_fn = paste0("~/case-study-2/WESAD/", subject_no, "/", subject_no, "_E4_Data/HR.csv") #CHANGE FILEPATH to be : paste0("/hpc/group/sta440-f20/WESAD/WESAD/", subject_no, "/" )
# Drop the first row of the E4 HR file -- presumably the sample-rate
# header row of the Empatica export; TODO confirm.
HR_data = read.csv(HR_fn)[-1,]
hr = HR_calc(as.data.frame(HR_data), 5, 1)
# Assemble the four rolling HR statistics into one data.frame (columns
# filled in the list order mu, sd, min, max).
df_hr = data.frame(matrix(unlist(hr), nrow= length(hr$mu),
ncol=4, byrow = F))
colnames(df_hr) = c("hr_wrist_mu","hr_wrist_sd", "hr_wrist_min", "hr_wrist_max")#, "hr_wrist_range")
df_hr$ID =seq.int(nrow(df_hr))
# Drop the first 40 feature windows before aligning with HR -- presumably
# to offset the E4 HR start delay; TODO confirm the 40-row offset.
feature_data = feature_data[-c(1:40),]
feature_data$ID <- seq.int(nrow(feature_data))
# Align chest/wrist features with HR features by window index.
S_df = merge(feature_data, df_hr, by="ID")
# NOTE: rbind in a loop grows ALL_df repeatedly; acceptable for 15 files.
ALL_df = rbind(ALL_df, S_df)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
# Keep only the two conditions of interest -- presumably 2 = stress and
# 3 = amusement per the WESAD label scheme; TODO confirm.
ALL_df = ALL_df %>% filter(Label %in% c("2","3"))
write.csv(ALL_df, "master10000_df.csv")
# NOTE(review): the line above writes "master10000_df.csv" but the line
# below reads "master_df.csv" -- presumably a previously saved master
# file; verify this mismatch is intentional. The first two columns
# (row index + window ID) are dropped on read.
ALL_df = read.csv("master_df.csv", header = T)[,-c(1:2)]
# get rid of chest EDA and temp and all min max (underdispersion)
remove_select_cov = ALL_df %>%
dplyr::select(-contains("ACC_wrist")) %>%
dplyr::select(-EDA_mean, -EDA_sd, -EDA_max, -EDA_min) %>%
dplyr::select(-Temp_mean, -Temp_sd, -Temp_max, -Temp_min) %>%
dplyr::select(-contains("BVP")) %>%
dplyr::select(-contains("ECG")) %>%
dplyr::select(-contains("min")) %>%
dplyr::select(-contains("max")) %>%
dplyr::select(-contains("ACC_chest_X_mean")) %>%
dplyr::select(-contains("ACC_chest_Y_mean")) %>%
dplyr::select(-contains("ACC_chest_Z_mean")) %>%
dplyr::select(-contains("ACC_chest_3D_mean"))
# Reorder columns so the response/grouping columns come last.
col_order <- c("ACC_chest_X_sd", "ACC_chest_Y_sd", "ACC_chest_Z_sd", "ACC_chest_3D_sd", "EMG_mean", "EMG_sd", "Resp_mean", "Resp_sd", "EDA_wrist_mean", "EDA_wrist_sd", "TEMP_wrist_mean", "TEMP_wrist_sd", "hr_wrist_mu", "hr_wrist_sd", "Subject", "Label")
remove_select_cov <- remove_select_cov[, col_order]
# Subject as a factor over the 15 WESAD participants (S12 absent).
remove_select_cov$Subject <- factor(remove_select_cov$Subject, levels = c("S2", "S3","S4", "S5","S6", "S7","S8", "S9","S10", "S11","S13", "S14","S15", "S16","S17"))
# Binary outcome: 1 = label "2" (stress), 0 = otherwise (amusement).
remove_select_cov$Label <- ifelse(remove_select_cov$Label=="2", 1, 0)
# Numeric-feature-only copy for the correlation matrix.
non_biological = c("Label", "Subject")
remove_select_cov_nb = remove_select_cov[ , !(names(remove_select_cov) %in% non_biological)]
cormatrix = cor(remove_select_cov_nb)
corrplot::corrplot(cormatrix, method="color", addCoef.col="black", tl.cex = 0.4, number.cex= 10/ncol(remove_select_cov_nb), title="Fig. 1 Correlation Matrix of Engineered Features", mar=c(0,0,1,0))
From the correlation matrix (Fig. 1), we see that the
ACC variables (variables related to motion) are highly correlated. We also see some moderate correlation between Resp and EDA variables, as well as between EDA and TEMP variables.
After examining boxplots of the distribution of the physiological measures by state for each subject (see Appendix A.1 for all plots of this type), the variables that had the greatest difference in distribution between states and across subjects are
EDA_wrist_mean, Temp_wrist_mean, and hr_wrist_mu. For all subjects, mean EDA values (Fig. 2) are higher in the stress state, and for subjects 3, 5, 6, 7, 10, 13, the difference in the distribution of EDA between the two states is more drastic than that of the other subjects. For Temp_wrist_mean (Fig. 3), we can see that for some subjects, their mean temperature tends to be higher in the stress state but for others, their mean temperature tends to be lower when stressed. Likewise, for hr_wrist_mu (Fig. 4), we see that some subjects have higher average heart rate when stressed while others exhibit the opposite behavior. Among all of the other variables, it appears that the motion (ACC) variables tend to show a higher distribution for each subject under the stress state, and the EMG and Resp variables tend not to vary much in distribution between states across subjects.
For the wrist only data, we see similar correlation structures as before (with all of the data). Motion variables are strongly correlated with each other, and there is some correlation between
EDA and TEMP variables.
# Standardize the wrist-only features and run PCA, keeping 4 components.
# NOTE(review): wrist_nb comes from an earlier (unseen) chunk --
# presumably the numeric wrist-only feature columns; confirm.
scaled_wrist_nb = scale(wrist_nb, center = TRUE, scale = TRUE)
res.pca <- PCA(scaled_wrist_nb, graph = FALSE, ncp = 4)
# Eigenvalues / percent variance explained per component.
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.5700222 35.700222 35.70022
## Dim.2 1.6790330 16.790330 52.49055
## Dim.3 1.2389191 12.389191 64.87974
## Dim.4 1.0077038 10.077038 74.95678
## Dim.5 0.8810815 8.810815 83.76760
## Dim.6 0.7007898 7.007898 90.77550
## Dim.7 0.3410337 3.410337 94.18583
## Dim.8 0.2846091 2.846091 97.03192
## Dim.9 0.1789510 1.789510 98.82143
## Dim.10 0.1178566 1.178566 100.00000
# Scree plot of percent variance explained per component.
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
In the quality of representation plot above (Fig. 6), which shows the importance of each principal component for a given observation, we can see that the principal components (with the number of components being 4) capture variability in motion, dermal temperature and activity, heart rate, and variability in temperature.
# Wrist-only PCA scores with label/subject re-attached for later CV.
# NOTE(review): wrist_only comes from an earlier (unseen) chunk.
wrist_pca_vals <- as.data.frame(res.pca$ind$coord)
wrist_pca_vals$Label = wrist_only$Label
wrist_pca_vals$Subject = wrist_only$Subject
# Center (but deliberately do NOT rescale: scale = FALSE) the wrist
# features for the raw-feature logistic model.
wrist_only_nsubj = wrist_only %>% dplyr::select(-Subject, -Label)
scaled_wrist_only = scale(wrist_only_nsubj, center=TRUE, scale = FALSE)
scaled_wrist_only = cbind(scaled_wrist_only, as.data.frame(wrist_only$Subject), as.data.frame(wrist_only$Label))
scaled_wrist_only = scaled_wrist_only %>%
rename(Subject = `wrist_only$Subject`) %>%
rename(Label = `wrist_only$Label`)
# 75/25 train/test split.
# NOTE(review): sample() is not seeded immediately before this split, so
# the split depends on the RNG state at knit time.
smp_size <- floor(0.75 * nrow(scaled_wrist_only))
train_ind <- sample(seq_len(nrow(scaled_wrist_only)), size = smp_size)
train_wrist <- scaled_wrist_only[train_ind, ]
test_wrist <- scaled_wrist_only[-train_ind, ]
# Wrist-only model: full logistic regression on all wrist features,
# excluding Subject as a predictor.
wrist.full = glm(Label ~. -Subject, family = "binomial", data = train_wrist)
# summary(wrist.full)
#
# pred.test = wrist.full %>% predict(test_wrist, type= "response")
# pred.test = ifelse(pred.test>0.5, 1,0)
#
# mean(test_wrist$Label==pred.test)
# Stepwise AIC variable selection (MASS::stepAIC) on the wrist model.
step.wrist.model = wrist.full %>% stepAIC(trace=FALSE)
summary(step.wrist.model)
##
## Call:
## glm(formula = Label ~ ACC_wrist_x_sd + ACC_wrist_y_sd + ACC_wrist_z_sd +
## ACC_wrist_3D_sd + EDA_wrist_mean + EDA_wrist_sd + TEMP_wrist_mean +
## hr_wrist_mu + hr_wrist_sd, family = "binomial", data = train_wrist)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7400 -0.9264 0.1459 0.8409 2.0653
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.667489 0.026424 63.10 < 2e-16 ***
## ACC_wrist_x_sd -0.137729 0.008103 -17.00 < 2e-16 ***
## ACC_wrist_y_sd -0.161818 0.006344 -25.51 < 2e-16 ***
## ACC_wrist_z_sd 0.087019 0.006000 14.50 < 2e-16 ***
## ACC_wrist_3D_sd 0.793297 0.021369 37.12 < 2e-16 ***
## EDA_wrist_mean 0.284959 0.008532 33.40 < 2e-16 ***
## EDA_wrist_sd 11.659561 0.609005 19.14 < 2e-16 ***
## TEMP_wrist_mean -0.190817 0.009584 -19.91 < 2e-16 ***
## hr_wrist_mu -0.053886 0.001332 -40.46 < 2e-16 ***
## hr_wrist_sd -0.255107 0.061035 -4.18 2.92e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 60795 on 46622 degrees of freedom
## Residual deviance: 43312 on 46613 degrees of freedom
## AIC: 43332
##
## Number of Fisher Scoring iterations: 7
# Convert training data to matrix format for glmnet.
train_wrist_x = train_wrist %>%
dplyr::select(-Label) %>%
mutate(Subject = as.factor(Subject))
# NOTE(review): data.matrix() maps the Subject factor to its integer
# codes, so glmnet sees Subject as one numeric column rather than dummy
# indicators -- confirm this is intended.
x <- data.matrix(train_wrist_x)
y <- train_wrist$Label
# Regularization paths: lasso (alpha = 1) and elastic net (alpha = 0.5).
lasso.fit = glmnet(x, y, alpha = 1, family = "binomial")
plot(lasso.fit)
elasticnet.fit = glmnet(x, y, alpha = 0.5, family = "binomial")
plot(elasticnet.fit)
# set.seed(123)
#
# kfoldcv_wrist_full = function(df) {
#
# actual_values = c()
# predicted_values = c()
# fitted_probs = c()
#
# # fold_accuracies = c()
# # fold_f1 = c()
#
# fold_no = c()
# subject = c()
#
# print("at folds")
# folds <- createFolds(factor(df$Subject), k = 5)
#
# print("past folds")
# for (i in 1:5) {
# #train and test
# col = paste0("Fold", i)
# indx = unlist(folds[col])
#
# train<- df[-indx,]
# print(nrow(train))
# test<- df[indx, ]
# print(nrow(test))
#
#
# # run model
# wrist.full = glm(Label ~. -Subject, family = "binomial", data = train)
# #summary(wrist.full)
#
# pred.test = wrist.full %>% predict(test, type= "response")
# fitted_probs = c(fitted_probs, pred.test)
# pred.test = ifelse(pred.test>0.5, 1,0)
# predicted_values = c(predicted_values, pred.test)
# actual_values = c(actual_values, test$Label)
#
#
# subject = c(subject, test$Subject)
# fold_no = c(fold_no, rep(i, nrow(test)))
#
#
#
# #acc1 = mean(test1$Label==pred.test)
# #fold_accuracies = c(fold_accuracies, acc1)
#
# # add function to compute f1
# # blah
# }
# #train on training set
# #test on testing set
# #predict labels
# #find accuracy and f1, add to fold_accuracies and fold_f1
# #put predicted values in predicted_values
# #put actual values in actual_values
#
# cvoutput = data.frame(cbind(actual_values, predicted_values, fitted_probs,
# fold_no, subject))
#
# return(cvoutput)
# }
#
# wristcvoutput = kfoldcv_wrist_full(scaled_wrist_only)
# fold1 = wristcvoutput %>%
# filter(subject=="S11")
#
# mean(fold1$actual_values==fold1$predicted_value)
# Standardize the combined chest+wrist features and run PCA, keeping 4
# components (the same pipeline used above for the wrist-only data).
scaled_select_cov_nb = scale(remove_select_cov_nb, center = TRUE, scale = TRUE)
res.pca <- PCA(scaled_select_cov_nb, graph = FALSE, ncp = 4)
# Eigenvalues / percent variance explained per component.
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.89171906 27.7979933 27.79799
## Dim.2 1.63856781 11.7040558 39.50205
## Dim.3 1.26951655 9.0679754 48.57002
## Dim.4 1.06733384 7.6238132 56.19384
## Dim.5 1.02559562 7.3256830 63.51952
## Dim.6 1.00788303 7.1991645 70.71869
## Dim.7 0.91475232 6.5339451 77.25263
## Dim.8 0.81471460 5.8193900 83.07202
## Dim.9 0.70593320 5.0423800 88.11440
## Dim.10 0.63566406 4.5404576 92.65486
## Dim.11 0.33787458 2.4133899 95.06825
## Dim.12 0.32501719 2.3215513 97.38980
## Dim.13 0.29440169 2.1028692 99.49267
## Dim.14 0.07102645 0.5073318 100.00000
# Scree plot of percent variance explained per component.
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))
From the plot above (Fig. 7), we see that the principal components are variability in movement (Dim.1), dermal temperature and activity (Dim.2), heart rate (Dim.3), and neuro-muscular activity (Dim.4).
# All-data PCA scores with label/subject re-attached.
pca_vals <- as.data.frame(res.pca$ind$coord)
pca_vals$Label = remove_select_cov$Label
pca_vals$Subject = remove_select_cov$Subject
# 75/25 train/test split.
# NOTE(review): sample() is not seeded immediately before this split, so
# the split depends on the RNG state at knit time.
smp_size <- floor(0.75 * nrow(pca_vals))
train_ind <- sample(seq_len(nrow(pca_vals)), size = smp_size)
train_pca <- pca_vals[train_ind, ]
test_pca <- pca_vals[-train_ind, ]
# Combined (chest + wrist) PCA logistic model without a Subject effect;
# main effects of the four components are interpreted below.
pca.model = glm(Label ~ .-Subject, family = binomial(link="logit"), data=train_pca)
# summary(pca.model)
#
# pred.test = pca.model %>% predict(test_pca, type= "response")
# pred.test = ifelse(pred.test>0.5, 1,0)
#
# mean(test_pca$Label==pred.test)
# Coefficient table: point estimates, SEs, and profile-likelihood CIs,
# rounded to 2 dp for display.
est = round(summary(pca.model)$coefficients[,1],2)
SE = round(summary(pca.model)$coefficients[,2],2)
ci = round(confint(pca.model),2)
## Waiting for profiling to be done...
combo_model_tab = data.frame(cbind(est,SE,ci))
colnames(combo_model_tab) = c("Estimate", "SE", "2.5%", "97.5%")
kable(combo_model_tab, caption = "Estimate and Confidence Intervals for Coefficients in the All Data Model")
| Estimate | SE | 2.5% | 97.5% | |
|---|---|---|---|---|
| (Intercept) | 1.10 | 0.01 | 1.07 | 1.13 |
| Dim.1 | 1.00 | 0.01 | 0.97 | 1.02 |
| Dim.2 | 0.32 | 0.01 | 0.30 | 0.35 |
| Dim.3 | -0.35 | 0.01 | -0.38 | -0.33 |
| Dim.4 | -0.28 | 0.01 | -0.31 | -0.25 |
An increase of one in the standard deviation for movement and dermal temperature and activity is associated with a multiplicative increase in the odds that the state is stress. An increase of one in the standard deviation of heart rate and neuro-muscular activity is associated with a multiplicative decrease in the odds that the state is stress.
# Stepwise AIC selection on the all-data PCA model.
# NOTE(review): this reuses the name `step.wrist.model`, which earlier
# held the wrist-only model -- a distinct name would be clearer.
step.wrist.model = pca.model %>% stepAIC(trace=FALSE)
summary(step.wrist.model)
##
## Call:
## glm(formula = Label ~ (Dim.1 + Dim.2 + Dim.3 + Dim.4 + Subject) -
## Subject, family = binomial(link = "logit"), data = train_pca)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7853 -0.8520 0.3303 0.8058 1.9831
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.10143 0.01490 73.90 <2e-16 ***
## Dim.1 0.99741 0.01178 84.68 <2e-16 ***
## Dim.2 0.32336 0.01273 25.41 <2e-16 ***
## Dim.3 -0.35209 0.01196 -29.43 <2e-16 ***
## Dim.4 -0.27894 0.01349 -20.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 60795 on 46622 degrees of freedom
## Residual deviance: 45272 on 46618 degrees of freedom
## AIC: 45282
##
## Number of Fisher Scoring iterations: 6
# Convert training data to matrix format for glmnet.
train_pca_x = train_pca %>%
dplyr::select(-Label) %>%
mutate(Subject = as.factor(Subject))
# NOTE(review): data.matrix() maps the Subject factor to its integer
# codes, so glmnet sees Subject as one numeric column rather than dummy
# indicators -- confirm this is intended.
x <- data.matrix(train_pca_x)
y <- train_pca$Label
# Regularization paths: lasso (alpha = 1) and elastic net (alpha = 0.5).
lasso.fit = glmnet(x, y, alpha = 1, family = "binomial")
plot(lasso.fit)
elasticnet.fit = glmnet(x, y, alpha = 0.5, family = "binomial")
plot(elasticnet.fit)
set.seed(123)
# 5-fold cross-validation for the PCA logistic model
# (Label ~ principal components, no Subject effect).
# Folds are stratified on Subject via caret::createFolds so every fold
# contains rows from every subject. Returns a data.frame with the actual
# labels, predicted labels, fitted probabilities, fold number, and
# subject code for each held-out row.
kfoldcv_no_subject = function(df) {
  actual_values = c()
  predicted_values = c()
  fitted_probs = c()
  fold_no = c()
  subject = c()

  folds <- createFolds(factor(df$Subject), k = 5)

  for (k in 1:5) {
    # Split: fold k held out, the rest used for fitting.
    held_out <- unlist(folds[paste0("Fold", k)])
    fit_df <- df[-held_out, ]
    eval_df <- df[held_out, ]

    # Refit the model on the training folds only.
    fold_model = glm(Label ~ . - Subject, family = binomial(link = "logit"),
                     data = fit_df)

    # Held-out probabilities and 0.5-thresholded class predictions.
    p = predict(fold_model, eval_df, type = "response")
    fitted_probs = c(fitted_probs, p)
    predicted_values = c(predicted_values, ifelse(p > 0.5, 1, 0))
    actual_values = c(actual_values, eval_df$Label)
    # NOTE: c() on a factor keeps integer codes, so subjects appear as
    # numbers in the output.
    subject = c(subject, eval_df$Subject)
    fold_no = c(fold_no, rep(k, nrow(eval_df)))
  }

  # cbind picks up the accumulator variable names as column names.
  data.frame(cbind(actual_values, predicted_values, fitted_probs,
                   fold_no, subject))
}
# Run 5-fold CV for both the all-data and wrist-only PCA models.
combocvoutput = kfoldcv_no_subject(pca_vals)
wristcvoutput = kfoldcv_no_subject(wrist_pca_vals)
# Per-fold accuracy and F1 for each model.
combo_acc_per_fold = combocvoutput %>%
group_by(fold_no) %>%
summarize(accuracy = mean(actual_values==predicted_values))
## `summarise()` ungrouping output (override with `.groups` argument)
combo_f1_per_fold = combocvoutput %>%
group_by(fold_no) %>%
summarize(f1 = F1_Score(actual_values, predicted_values, positive = NULL))
## `summarise()` ungrouping output (override with `.groups` argument)
wrist_acc_per_fold = wristcvoutput %>%
group_by(fold_no) %>%
summarize(accuracy = mean(actual_values==predicted_values))
## `summarise()` ungrouping output (override with `.groups` argument)
wrist_f1_per_fold = wristcvoutput %>%
group_by(fold_no) %>%
summarize(f1 = F1_Score(actual_values, predicted_values, positive = NULL))
## `summarise()` ungrouping output (override with `.groups` argument)
# Average the per-fold metrics, rounded to 2 dp for display.
acc_f1_data <- matrix(c(round(mean(combo_acc_per_fold$accuracy),2),
round(mean(wrist_acc_per_fold$accuracy),2),
round(mean(combo_f1_per_fold$f1),2),
round(mean(wrist_f1_per_fold$f1),2)),ncol=2,byrow=TRUE)
colnames(acc_f1_data) <- c("All","Wrist")
rownames(acc_f1_data) <- c("Accuracy","F1")
acc_f1_tab <- as.table(acc_f1_data)
kable(acc_f1_tab, caption = "Accuracy and F1 for All Data and Wrist Models")
| All | Wrist | |
|---|---|---|
| Accuracy | 0.81 | 0.74 |
| F1 | 0.73 | 0.59 |
# Confusion matrices over the pooled (all-folds) CV predictions.
combo_cm = as.table(confusionMatrix(as.factor(combocvoutput$predicted_values), as.factor(combocvoutput$actual_values)))
colnames(combo_cm) = c("Actual Amusement", "Actual Stress")
rownames(combo_cm) = c("Predicted Amusement", "Predicted Stress")
#kable(combo_cm, title = "Confusion Matrix for All Data Model")
wrist_cm = as.table(confusionMatrix(as.factor(wristcvoutput$predicted_values), as.factor(wristcvoutput$actual_values)))
colnames(wrist_cm) = c("Actual Amusement", "Actual Stress")
rownames(wrist_cm) = c("Predicted Amusement", "Predicted Stress")
#kable(wrist_cm, title = "Confusion Matrix for Wrist Data Model")
knitr::kable(list(combo_cm, wrist_cm), title = "Confusion Matrices for All Data (left) and Wrist (right) Models")
|
|
In Table 1 (and in the confusion matrices Table 2), we see the average accuracy and F1 results of 5-folds cross validation for a model using only wrist data and a model using both chest and wrist data. The model using both types of sensor data has higher accuracy and F1.
# FINAL MODEL: fit on the full data set; PCA components plus
# component-by-Subject interactions, with no Subject main effect, so the
# model captures subject-specific slopes while sharing one intercept.
pca.model.interact = glm(Label ~ Dim.1+Dim.2+Dim.3+Dim.4+Dim.1:Subject
+Dim.2:Subject
+Dim.3:Subject
+Dim.4:Subject
, family = binomial(link="logit"), data=pca_vals)
summary(pca.model.interact)
##
## Call:
## glm(formula = Label ~ Dim.1 + Dim.2 + Dim.3 + Dim.4 + Dim.1:Subject +
## Dim.2:Subject + Dim.3:Subject + Dim.4:Subject, family = binomial(link = "logit"),
## data = pca_vals)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6026 -0.2067 0.0238 0.4043 3.1446
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.75764 0.04035 68.345 < 2e-16 ***
## Dim.1 0.31351 0.08606 3.643 0.000270 ***
## Dim.2 -8.43530 0.41164 -20.492 < 2e-16 ***
## Dim.3 -1.60165 0.07041 -22.746 < 2e-16 ***
## Dim.4 0.40475 0.08485 4.770 1.84e-06 ***
## Dim.1:SubjectS3 1.27153 0.09933 12.801 < 2e-16 ***
## Dim.1:SubjectS4 1.74004 0.09940 17.506 < 2e-16 ***
## Dim.1:SubjectS5 4.43791 0.20646 21.495 < 2e-16 ***
## Dim.1:SubjectS6 5.26245 0.36695 14.341 < 2e-16 ***
## Dim.1:SubjectS7 5.00461 0.22440 22.302 < 2e-16 ***
## Dim.1:SubjectS8 1.99914 0.09962 20.069 < 2e-16 ***
## Dim.1:SubjectS9 0.61708 0.09100 6.781 1.19e-11 ***
## Dim.1:SubjectS10 1.02093 0.10497 9.726 < 2e-16 ***
## Dim.1:SubjectS11 3.25439 0.20597 15.801 < 2e-16 ***
## Dim.1:SubjectS13 0.18959 0.08811 2.152 0.031420 *
## Dim.1:SubjectS14 1.43407 0.09598 14.942 < 2e-16 ***
## Dim.1:SubjectS15 1.09561 0.09425 11.624 < 2e-16 ***
## Dim.1:SubjectS16 2.03518 0.11104 18.328 < 2e-16 ***
## Dim.1:SubjectS17 0.29241 0.09340 3.131 0.001745 **
## Dim.2:SubjectS3 12.02063 0.41988 28.629 < 2e-16 ***
## Dim.2:SubjectS4 12.58803 0.43595 28.875 < 2e-16 ***
## Dim.2:SubjectS5 16.02306 0.60414 26.522 < 2e-16 ***
## Dim.2:SubjectS6 33.04259 1.70286 19.404 < 2e-16 ***
## Dim.2:SubjectS7 6.56933 0.46842 14.025 < 2e-16 ***
## Dim.2:SubjectS8 8.62471 0.43988 19.607 < 2e-16 ***
## Dim.2:SubjectS9 12.13523 0.42372 28.640 < 2e-16 ***
## Dim.2:SubjectS10 10.85709 0.45676 23.770 < 2e-16 ***
## Dim.2:SubjectS11 20.90306 0.69993 29.865 < 2e-16 ***
## Dim.2:SubjectS13 8.30807 0.41217 20.157 < 2e-16 ***
## Dim.2:SubjectS14 10.48645 0.42848 24.473 < 2e-16 ***
## Dim.2:SubjectS15 8.94947 0.41727 21.448 < 2e-16 ***
## Dim.2:SubjectS16 9.48707 0.41735 22.732 < 2e-16 ***
## Dim.2:SubjectS17 9.70360 0.42230 22.978 < 2e-16 ***
## Dim.3:SubjectS3 0.85803 0.10300 8.331 < 2e-16 ***
## Dim.3:SubjectS4 1.56018 0.09992 15.615 < 2e-16 ***
## Dim.3:SubjectS5 6.08450 0.28912 21.045 < 2e-16 ***
## Dim.3:SubjectS6 3.30071 0.32375 10.195 < 2e-16 ***
## Dim.3:SubjectS7 -0.67064 0.23745 -2.824 0.004738 **
## Dim.3:SubjectS8 1.31896 0.13318 9.904 < 2e-16 ***
## Dim.3:SubjectS9 2.82131 0.11328 24.906 < 2e-16 ***
## Dim.3:SubjectS10 -1.41101 0.10794 -13.072 < 2e-16 ***
## Dim.3:SubjectS11 0.61919 0.18121 3.417 0.000633 ***
## Dim.3:SubjectS13 -0.91241 0.10440 -8.740 < 2e-16 ***
## Dim.3:SubjectS14 -0.97794 0.15333 -6.378 1.79e-10 ***
## Dim.3:SubjectS15 0.30891 0.10075 3.066 0.002169 **
## Dim.3:SubjectS16 1.42710 0.09288 15.365 < 2e-16 ***
## Dim.3:SubjectS17 3.58877 0.10381 34.572 < 2e-16 ***
## Dim.4:SubjectS3 -1.58708 0.10145 -15.644 < 2e-16 ***
## Dim.4:SubjectS4 -1.83509 0.12602 -14.562 < 2e-16 ***
## Dim.4:SubjectS5 -2.03910 0.16427 -12.413 < 2e-16 ***
## Dim.4:SubjectS6 0.01843 0.43488 0.042 0.966200
## Dim.4:SubjectS7 1.92784 0.26413 7.299 2.90e-13 ***
## Dim.4:SubjectS8 -2.03017 0.11333 -17.914 < 2e-16 ***
## Dim.4:SubjectS9 -0.60533 0.09967 -6.073 1.25e-09 ***
## Dim.4:SubjectS10 -0.10198 0.11314 -0.901 0.367421
## Dim.4:SubjectS11 -1.79669 0.17621 -10.196 < 2e-16 ***
## Dim.4:SubjectS13 -0.92594 0.09385 -9.866 < 2e-16 ***
## Dim.4:SubjectS14 -0.09012 0.10881 -0.828 0.407515
## Dim.4:SubjectS15 -0.45976 0.09844 -4.670 3.01e-06 ***
## Dim.4:SubjectS16 -1.72068 0.12083 -14.240 < 2e-16 ***
## Dim.4:SubjectS17 -0.50899 0.09819 -5.184 2.18e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 81147 on 62163 degrees of freedom
## Residual deviance: 31344 on 62103 degrees of freedom
## AIC: 31466
##
## Number of Fisher Scoring iterations: 12
# In-sample fitted probabilities and 0.5-thresholded class predictions
# from the final interaction model, stored back on pca_vals for the
# marginal-effect plots below.
pred.test = pca.model.interact %>% predict(pca_vals, type= "response")
pca_vals$fitted = pred.test
pred.test = ifelse(pred.test>0.5, 1,0)
pca_vals$predicted = pred.test
#
# mean(test_pca$Label==pred.test)
#
# F1_Score(test_pca$Label, pred.test, positive = NULL)
# confusionMatrix(as.factor(pred.test), as.factor(test_pca$Label))
set.seed(123)
# 5-fold cross-validation for the FINAL model: PCA components plus
# Dim:Subject interactions (no Subject main effect). Structure mirrors
# kfoldcv_no_subject() above; folds are stratified on Subject via
# caret::createFolds. Returns actual labels, predicted labels, fitted
# probabilities, fold number, and subject code per held-out row.
kfoldcv_final = function(df) {
actual_values = c()
predicted_values = c()
fitted_probs = c()
fold_no = c()
subject = c()
print("at folds")  # debug progress output (appears in the knitted log)
folds <- createFolds(factor(df$Subject), k = 5)
print("past folds")
for (i in 1:5) {
# Split: fold i held out, the rest used for fitting.
col = paste0("Fold", i)
indx = unlist(folds[col])
train<- df[-indx,]
print(nrow(train))  # debug: training rows per fold
test<- df[indx, ]
print(nrow(test))   # debug: held-out rows per fold
# Refit the interaction model on the training folds only.
pca.model.interact = glm(Label ~ Dim.1+Dim.2+Dim.3+Dim.4+Dim.1:Subject
+Dim.2:Subject
+Dim.3:Subject
+Dim.4:Subject
, family = binomial(link="logit"), data=train)
# Held-out probabilities and 0.5-thresholded class predictions.
pred.test = pca.model.interact %>% predict(test, type= "response")
fitted_probs = c(fitted_probs, pred.test)
pred.test = ifelse(pred.test>0.5, 1,0)
predicted_values = c(predicted_values, pred.test)
actual_values = c(actual_values, test$Label)
# NOTE: c() on a factor keeps integer codes, so subjects appear as
# numbers in the output.
subject = c(subject, test$Subject)
fold_no = c(fold_no, rep(i, nrow(test)))
}
# cbind picks up the accumulator variable names as column names.
cvoutput = data.frame(cbind(actual_values, predicted_values, fitted_probs,
fold_no, subject))
return(cvoutput)
}
finalcvoutput = kfoldcv_final(pca_vals)
## [1] "at folds"
## [1] "past folds"
## [1] 49729
## [1] 12435
## [1] 49732
## [1] 12432
## [1] 49732
## [1] 12432
## [1] 49731
## [1] 12433
## [1] 49732
## [1] 12432
# Per-fold and per-subject accuracy and F1 for the final model.
# NOTE: the `subject` column holds integer factor codes (see kfoldcv_final),
# not the original "S2".."S17" labels.
final_acc_per_fold = finalcvoutput %>%
group_by(fold_no) %>%
summarize(accuracy = mean(actual_values==predicted_values))
## `summarise()` ungrouping output (override with `.groups` argument)
final_f1_per_fold = finalcvoutput %>%
group_by(fold_no) %>%
summarize(f1 = F1_Score(actual_values, predicted_values, positive = NULL))
## `summarise()` ungrouping output (override with `.groups` argument)
final_acc_per_subj = finalcvoutput %>%
group_by(subject) %>%
summarize(accuracy = mean(actual_values==predicted_values))
## `summarise()` ungrouping output (override with `.groups` argument)
final_f1_per_subj = finalcvoutput %>%
group_by(subject) %>%
summarize(f1 = F1_Score(actual_values, predicted_values, positive = NULL))
## `summarise()` ungrouping output (override with `.groups` argument)
# REWMEMEVR TO ROUDN
# acc_f1_data <- matrix(c(mean(combo_acc_per_fold$accuracy),
# mean(wrist_acc_per_fold$accuracy),
# "", "",
# mean(combo_f1_per_fold$f1),
# mean(wrist_f1_per_fold$f1)),ncol=2,byrow=TRUE)
# # colnames(acc_f1_data) <- c("Combo","Wrist")
# # rownames(acc_f1_data) <- c("Accuracy","F1")
# acc_f1_tab <- as.table(acc_f1_data)
# library(broom)
# library(knitr)
# kable(acc_f1_tab, caption = "title")
| Actual Amusement | Actual Stress | |
|---|---|---|
| Predicted Amusement | 11608 | 5605 |
| Predicted Stress | 10692 | 34259 |
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
The ROC curve (Fig. 8) shows that our final model (using both chest and wrist data and including interactions for Subject) discriminates moderately well between the stress and amusement states. Specifically, the area under the curve is 0.8874232.
# ggplot(data = pca_vals %>% filter(Subject %in% c("S4", "S9", "S13", "S17")), aes(x = Dim.1, y = predicted, color = as.factor(Subject))) +
# #geom_point() +
# geom_jitter(width = 0.2, height = 0.1, size = 0.3) +
# #geom_smooth(method = 'glm', method.args = list(family = 'binomial'),
# # size = 2, se = F, aes(color = as.factor(Subject))) +
# geom_smooth(method = 'glm', method.args = list(family = 'binomial'),
# size = 0.5, se = F, fullrange = T, aes(color = as.factor(Subject))) +
# #geom_dl(aes(label = Subject), method = list(dl.combine("first.points", "last.points")))+
# labs(color = "Subject", y = "Pr(Amusement State)") +
# ylim(0,1)
#
# ggplot(data = pca_vals , aes(x = Dim.1, y = predicted, color = as.factor(Subject))) +
# #geom_point() +
# geom_jitter(width = 0.2, height = 0.1, size = 0.3) +
# #geom_smooth(method = 'glm', method.args = list(family = 'binomial'),
# # size = 2, se = F, aes(color = as.factor(Subject))) +
# geom_smooth(method = 'glm', method.args = list(family = 'binomial'),
# size = 0.5, se = F, fullrange = T, aes(color = as.factor(Subject))) +
# geom_dl(aes(label = Subject), method = list(dl.combine("first.points", "last.points")))+
# labs(color = "Subject", y = "Pr(Amusement State)") +
# ylim(0,1)
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
As the variation in motion increases, the fitted probabilities (where stress is 1) increase similarly for all subjects except for S13 and S2. We note for S13 and S2 that as the variability of motion becomes greater, the fitted probability of stress increases by a smaller amount. However, all subjects exhibit a similar increasing trend. This behavior can be corroborated by Table [][][] in Appendix [][][]. As the variation in mean dermal temperature and activity increases, the fitted probabilities (where stress is 1) increase similarly for all subjects except for S2, S7, and S13. For these three subjects, the fitted probabilities decrease. This behavior can be corroborated by Table [][][] in Appendix [][][].
# Marginal-effect plots (sjPlot::plot_model, type = "pred") for the final
# interaction model: predicted probability curves over Dim.3 (heart rate)
# and Dim.4 (neuro-muscular activity), one curve per Subject.
# NOTE(review): `mycolors` is defined in an earlier (unseen) chunk.
# Fixed the axis-label typo "Probabilites" -> "Probabilities".
p3 = plot_model(pca.model.interact, type = "pred", terms = c("Dim.3 [all]", "Subject")) +
scale_fill_manual(values = mycolors) +
scale_y_continuous(name="Fitted Probabilities", limits=c(0, 1)) +
labs(title = "Fig. 11 Marginal Effects of Heart Rate")
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
p4 = plot_model(pca.model.interact, type = "pred", terms = c("Dim.4 [all]", "Subject")) +
scale_fill_manual(values = mycolors) +
scale_y_continuous(name="Fitted Probabilities", limits=c(0, 1)) +
labs(title = "Fig. 12 Marginal Effects of Neuro-Muscular Activity")
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
# Side-by-side layout of the two marginal-effect panels.
grid.arrange(p3, p4, nrow = 1)
As the variation in mean heart rate increases, the fitted probabilities (where stress is 1) decreases similarly for all subjects except for S5, S6, S9, and S17. For these four subjects, the fitted probabilities increase. This behavior can be corroborated by Table [][][] in Appendix [][][].
NOTE (malavi): subjects with atypical marginal-effect patterns: 6, 7, 2, 14, 10.
varvslab <- function(var) {
  # Boxplot of one engineered feature against state (Label), faceted by
  # Subject. Used for the Appendix A.1 distribution plots.
  #
  # var: character scalar naming a column of remove_select_cov.
  # Relies on `remove_select_cov` existing in the calling environment.
  #
  # Replaces deprecated aes_()/as.name() with the .data pronoun, and
  # data-masked column references instead of remove_select_cov$... inside
  # aes()/facet_grid() (the $ form bypasses ggplot's data masking and
  # breaks if the plot data is ever subset).
  ggplot(remove_select_cov,
         aes(y = .data[[var]], x = factor(Label), group = factor(Label))) +
    geom_boxplot() +
    facet_grid(~Subject) +
    labs(x = "State", y = var)
}
# Draw the boxplot grid for every biological feature column.
do.call(grid.arrange, lapply(names(remove_select_cov_nb), varvslab))
# IBI_data_3 = read.csv("~/case-study-2/WESAD/S3/S3_E4_Data/IBI.csv")
#
# RMSSD_calc = function(df, peaks) {
# # for (i in 1:nrow(df)) {
# # if (i <= nrow(df)-peaks)
# vals = rollapply(df[,2], width = peaks, by = 1, FUN = rmssd, align = "left")
# #}
#
# return(vals)
# }
#
# hrv = RMSSD_calc(IBI_data_3, 3)
# IBI_data_3[1:length(hrv),3] = hrv
# colnames(IBI_data_3)[3] = "HRV"
# IBI_data_3 = na.omit(IBI_data_3)
# hz_format = function(df, win_size) {
# mu_vals = NULL
# sd_vals = NULL
#
# for (i in 1:nrow(df)) {
# #if (i <= nrow(df)-win_size)
# nexttime = df[i,1] + win_size
# # browser()
#
# #print(nexttime)
# if (i+1 <= nrow(df)) {
#
# for (j in i+1:nrow(df)) {
# # print(df[j,1])
# if (!is.na(df[j,1])) {
# # print(df[j,1]==nexttime)
# # if (df[j,1]==nexttime) {
# # stoprow = j
# # }
# # print(i)
# # print(j)
# # print(df[j,1])
# # print(nexttime)
# if ((df[j,1] >= nexttime) & (df[j-1,1]<nexttime)) {
# stoprow = j-1
# }
# }
# }
# mu_vals = c(mu_vals, mean(df[i:stoprow, 3]))
# sd_vals = c(sd_vals, sqrt(var(df[i:stoprow, 3])))
#
# }
# }
#
# return(list(hrv_mu = mu_vals, hrv_sd = sd_vals))
# }
# test = hz_format(IBI_data_3, 5)